Just checking my ability to load the AHP data and see overall structure with a PCA
library(tidyverse)
── Attaching core tidyverse packages ─────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 2.0.0 ──
✔ dplyr 1.1.2 ✔ readr 2.1.4
✔ forcats 1.0.0 ✔ stringr 1.5.0
✔ ggplot2 3.4.2 ✔ tibble 3.2.1
✔ lubridate 1.9.2 ✔ tidyr 1.3.0
✔ purrr 1.0.1 ── Conflicts ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
ℹ Use the ]8;;http://conflicted.r-lib.org/conflicted package]8;; to force all conflicts to become errors
library(metamoRph)
proteome_df <- data.table::fread('../data/AhpDb_PsmMatrix.csv')
Registered S3 method overwritten by 'data.table':
method from
print.data.table
Warning: Discarded single-line footer: <<(18869 rows affected)>>
proteome_df <- proteome_df[-1,] # remove visual formatting line
# make numeric
proteome_matrix <- proteome_df[,-1] %>% as.matrix()
mode(proteome_matrix) = "numeric"
Warning: NAs introduced by coercion
row.names(proteome_matrix) <- proteome_df$Accession
proteome_meta <- read_delim('../data/AhpDb_ProteinSummary.csv', delim = 'asdf')
Rows: 1649 Columns: 1── Column specification ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: "asdf"
chr (1): UniProt_Id ,Gene_Symbol ...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
colnames(proteome_meta) <- 'one'
# some wacky stuff to handle the ... interesting ... data format
proteome_meta <- proteome_meta %>%
mutate_if(is.character, utf8::utf8_encode) %>%
separate(one, into =
c("UniProt_Id", "Gene_Symbol", "Protein_Name", "Gene_Names", "Accession", "PsmSum", "MeanPsm", "PsmPresent", "PctPresent"),
sep = '\\s\\s+,|,\\s\\s+')
Warning: Expected 9 pieces. Missing pieces filled with `NA` in 2 rows [1, 1649].
# first row is visual formatting
# last row is empty
proteome_meta <- proteome_meta[-c(1,1649),]
sample_meta <- read_csv('../data/AhpDb_ClinicalData.csv')
Warning: One or more parsing issues, call `problems()` on your data frame for details, e.g.:
dat <- vroom(...)
problems(dat)Rows: 313 Columns: 62── Column specification ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (62): SampleId, CD, Sex, Age, Race, Ethnicity, Iop, IopMethod, OdOs, Incisional, Smoking, Htn, Cvd, Cva, CollagenVd, Diabetes, OtherSystemicDiseases, G...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
sample_meta <- sample_meta[-c(1,314),]
Should remove the handful of samples with a PSM sum less than 2000 and sum normalize (a la TPM)
proteome_matrix %>% as_tibble() %>% select(contains("AH")) %>% colSums(na.rm = TRUE) %>% density() %>% plot()
proteome_matrix %>% as_tibble() %>% select(contains("AH")) %>% colSums(na.rm = TRUE) %>% summary()
Min. 1st Qu. Median Mean 3rd Qu. Max.
687 10093 11469 11783 13115 23364
remove_id <- proteome_matrix %>% as_tibble() %>% select(contains("AH")) %>% colSums(na.rm = TRUE) %>% enframe() %>% filter(value < 2000)
proteome_tib <- proteome_matrix %>% as_tibble(rownames = 'Accession') %>% select(-contains(remove_id$name))
proteome_tib[is.na(proteome_tib)] <- 0
First 100 samples
VERY long right tail
proteome_tib[,1:30] %>% pivot_longer(-Accession) %>%
ggplot(aes(y=name,x=value)) +
ggridges::geom_density_ridges2() +
cowplot::theme_cowplot()
STILL crazy right tail. Essentially virtually all the data is near zero
proteome_tib[,1:30] %>% pivot_longer(-Accession) %>%
ggplot(aes(y=name,x=log1p(value))) +
ggridges::geom_density_ridges2() +
ylab("Sample") +
cowplot::theme_cowplot()
Summary of row sums (summing all counts for each protein by sample)
proteome_tib[,-1] %>% rowSums(na.rm = TRUE) %>% summary()
Min. 1st Qu. Median Mean 3rd Qu. Max.
2.0 3.0 9.0 397.2 21.0 2572952.0
STILL STILL crazy right tail, even after removing all proteins with a sum < 311 (essentially average 1 count across the 311 samples)
proteome_tib[proteome_tib[,-1] %>% rowSums(na.rm = TRUE) > 311,1:30] %>%
pivot_longer(-Accession) %>%
ggplot(aes(y=name,x=log1p(value))) +
ggridges::geom_density_ridges2() +
ylab("Sample") +
cowplot::theme_cowplot()
Eight samples are REALLY different than the rest
proteome_mat_clean <- proteome_tib %>% select(contains("AH")) %>% as.matrix()
row.names(proteome_mat_clean) <- proteome_tib$Accession
aligned_sample_meta <- colnames(proteome_mat_clean) %>% enframe(value = 'SampleId') %>% select(-name) %>%
left_join(sample_meta)
Joining with `by = join_by(SampleId)`
proteome_mat_clean[is.na(proteome_mat_clean)] <- 0
pca_mm <- run_pca(proteome_mat_clean,aligned_sample_meta)
Sample CPM scaling
log1p scaling
prcomp start
plot <- pca_mm$PCA$x %>% as_tibble(rownames = 'SampleId') %>%
left_join(pca_mm$meta, by = 'SampleId') %>%
ggplot(aes(x=PC1,y=PC2,color = CD, label = SampleId)) +
geom_point() +
scale_color_manual(values = pals::alphabet() %>% unname()) +
cowplot::theme_cowplot()
plotly::ggplotly(plot)
Registered S3 methods overwritten by 'htmltools':
method from
print.html tools:rstudio
print.shiny.tag tools:rstudio
print.shiny.tag.list tools:rstudio
Registered S3 method overwritten by 'htmlwidgets':
method from
print.htmlwidget tools:rstudio
Removing those eight samples and AH602 (top left) and redoing the PCA
Looks reasonable now
proteome_mat_clean_filter <- proteome_mat_clean[,pca_mm$PCA$x[,c('PC1','PC2')] %>% as_tibble(rownames = 'sample') %>% filter(PC1 < 20, PC2 < 20) %>% pull(sample)]
aligned_sample_meta <- colnames(proteome_mat_clean_filter) %>% enframe(value = 'SampleId') %>% select(-name) %>%
left_join(sample_meta)
Joining with `by = join_by(SampleId)`
pca_mm <- run_pca(proteome_mat_clean_filter,aligned_sample_meta)
Sample CPM scaling
log1p scaling
prcomp start
plot <- pca_mm$PCA$x %>% as_tibble(rownames = 'SampleId') %>%
left_join(pca_mm$meta, by = 'SampleId') %>%
ggplot(aes(x=PC1,y=PC2,color = CD, label = SampleId)) +
geom_point() +
scale_color_manual(values = pals::alphabet() %>% unname()) +
cowplot::theme_cowplot()
plotly::ggplotly(plot)
save(proteome_mat_clean_filter, aligned_sample_meta, proteome_meta, file = '../data/ahpdb_cleaned_data.Rdata')
sessionInfo()
R version 4.3.0 (2023-04-21)
Platform: aarch64-apple-darwin20 (64-bit)
Running under: macOS Ventura 13.4.1
Matrix products: default
BLAS: /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.11.0
locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
time zone: America/New_York
tzcode source: internal
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] metamoRph_0.2.0 lubridate_1.9.2 forcats_1.0.0 stringr_1.5.0 dplyr_1.1.2 purrr_1.0.1 readr_2.1.4 tidyr_1.3.0 tibble_3.2.1
[10] ggplot2_3.4.2 tidyverse_2.0.0
loaded via a namespace (and not attached):
[1] bitops_1.0-7 rlang_1.1.1 magrittr_2.0.3 matrixStats_1.0.0 ggridges_0.5.4
[6] compiler_4.3.0 DelayedMatrixStats_1.21.0 vctrs_0.6.3 maps_3.4.1 fastmap_1.1.1
[11] pkgconfig_2.0.3 crayon_1.5.2 ellipsis_0.3.2 XVector_0.40.0 scuttle_1.9.4
[16] labeling_0.4.2 utf8_1.2.3 tzdb_0.4.0 bit_4.0.5 xfun_0.40
[21] bluster_1.9.1 zlibbioc_1.46.0 pals_1.7 beachmat_2.15.0 jsonlite_1.8.7
[26] GenomeInfoDb_1.36.1 DelayedArray_0.26.7 BiocParallel_1.33.11 irlba_2.3.5.1 parallel_4.3.0
[31] cluster_2.1.4 R6_2.5.1 stringi_1.7.12 limma_3.56.1 GenomicRanges_1.52.0
[36] Rcpp_1.0.11 SummarizedExperiment_1.30.2 knitr_1.43 IRanges_2.34.1 Matrix_1.5-4.1
[41] igraph_1.4.3 timechange_0.2.0 tidyselect_1.2.0 yaml_2.3.7 rstudioapi_0.14
[46] dichromat_2.0-0.1 abind_1.4-5 codetools_0.2-19 lattice_0.21-8 Biobase_2.60.0
[51] withr_2.5.0 pillar_1.9.0 MatrixGenerics_1.12.3 stats4_4.3.0 plotly_4.10.1
[56] generics_0.1.3 vroom_1.6.3 RCurl_1.98-1.12 S4Vectors_0.38.1 hms_1.1.3
[61] sparseMatrixStats_1.11.1 munsell_0.5.0 scales_1.2.1 glue_1.6.2 metapod_1.7.0
[66] mapproj_1.2.11 lazyeval_0.2.2 tools_4.3.0 BiocNeighbors_1.17.1 data.table_1.14.8
[71] ScaledMatrix_1.8.1 locfit_1.5-9.7 scran_1.27.1 cowplot_1.1.1 grid_4.3.0
[76] crosstalk_1.2.0 edgeR_3.42.2 colorspace_2.1-0 SingleCellExperiment_1.22.0 GenomeInfoDbData_1.2.10
[81] BiocSingular_1.15.0 cli_3.6.1 rsvd_1.0.5 fansi_1.0.4 viridisLite_0.4.2
[86] S4Arrays_1.0.5 gtable_0.3.3 digest_0.6.33 BiocGenerics_0.46.0 dqrng_0.3.0
[91] htmlwidgets_1.6.2 farver_2.1.1 htmltools_0.5.5 lifecycle_1.0.3 httr_1.4.6
[96] statmod_1.5.0 bit64_4.0.5